Application 6: VisualizingCorrelations and Models

Author

Erik Westlund

Published

June 12, 2025

# List of required packages
required_packages <- c(
  "dplyr",
  "ggplot2",
  "broom.mixed",
  "lme4",
  "readr"
)

# Install missing packages
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load all packages
library(dplyr)
library(ggplot2)
library(broom.mixed)
library(lme4)
library(readr)

source(here::here("examples", "colors.R"))
set.seed(123)

Looking At Data: Correlations

Correlations are indisposable for understanding the relationship between two variables, but they can be misleading as we show below.

This is adapted directly from Jan Vanhove

plot_r <- function(df, showSmoother = TRUE, smoother = "lm") {
  p <- ggplot(df, aes(x = x, y = y)) +
    geom_point(alpha = 0.7)
  
  if(showSmoother) {
    p <- p +
    geom_smooth(
      formula = y ~ x,
      method = smoother,
      color = colors$green$`500`,
      fill = colors$slate$`300`,
      alpha = 0.3,
    )
  }

  p <- p +
    facet_wrap(~title, scales = "free", ncol = 2) +
    theme_minimal(base_size = 12) +
    theme(
      strip.text = element_text(face = "bold", size = 14),
      axis.title = element_blank(),
      plot.title = element_text(size = 24, face = "bold", hjust = 0.5),
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 10),
      panel.spacing = unit(2, "lines")
    )
  
  p
}

corr_r <- function(r = 0.6, n = 50) {
  
  compute.y <- function(x, y, r) {
    theta <- acos(r)
    X <- cbind(x, y)
    Xctr <- scale(X, center = TRUE, scale = FALSE)    # Centered variables (mean 0)
    Id <- diag(n)                                     # Identity matrix
    Q <- qr.Q(qr(Xctr[, 1, drop = FALSE]))            # QR decomposition
    P <- tcrossprod(Q)                                # Projection onto space defined by x1
    x2o <- (Id - P) %*% Xctr[, 2]                     # x2ctr made orthogonal to x1ctr
    Xc2 <- cbind(Xctr[, 1], x2o)
    Y <- Xc2 %*% diag(1 / sqrt(colSums(Xc2 ^ 2)))
    y <- Y[, 2] + (1 / tan(theta)) * Y[, 1]
    return(y)
  }
  
  cases <- list(
    list(id = 1, title = "(1) Normal x, normal residuals", x = rnorm(n), y = rnorm(n)),
    list(id = 2, title = "(2) Uniform x, normal residuals", x = runif(n, 0, 1), y = rnorm(n)),
    list(id = 3, title = "(3) +-skewed x, normal residuals", x = rlnorm(n, 5), y = rnorm(n)),
    list(id = 4, title = "(4) --skewed x, normal residuals", x = rlnorm(n, 5) * -1 + 5000, y = rnorm(n)),
    list(id = 5, title = "(5) Normal x, +-skewed residuals", x = rnorm(n), y = rlnorm(n, 5)),
    list(id = 6, title = "(6) Normal x, --skewed residuals", x = rnorm(n), y = -rlnorm(n, 5)),
    list(id = 7, title = "(7) Increasing spread", 
         x = sort(rnorm(n)) + abs(min(rnorm(n))), 
         y = rnorm(n, 0, sqrt(abs(10 * sort(rnorm(n)))))),
    list(id = 8, title = "(8) Decreasing spread", 
         x = sort(rnorm(n)) + abs(min(rnorm(n))), 
         y = rnorm(n, 0, sqrt(pmax(0.1, abs(10 * max(sort(rnorm(n))) - 10 * sort(rnorm(n))))))),
    list(id = 9, title = "(9) Quadratic trend", x = rnorm(n), y = rnorm(n) ^ 2),
    list(id = 10, title = "(10) Sinusoid relationship", x = runif(n, -2 * pi, 2 * pi), y = sin(runif(n, -2 * pi, 2 * pi))),
    list(id = 11, title = "(11) A single positive outlier", x = c(rnorm(n - 1), 10), y = c(rnorm(n - 1), 15)),
    list(id = 12, title = "(12) A single negative outlier", x = c(rnorm(n - 1), 10), y = c(rnorm(n - 1), -15)),
    list(id = 13, title = "(13) Bimodal residuals", x = rnorm(n), y = c(rnorm(floor(n / 2), mean = -3), rnorm(ceiling(n / 2), 3))),
    list(id = 14, title = "(14) Two groups", 
         x = c(rnorm(floor(n / 2), -3), rnorm(ceiling(n / 2), 3)), 
         y = c(rnorm(floor(n / 2), mean = 3), rnorm(ceiling(n / 2), mean = -3))),
    list(id = 15, title = "(15) Sampling at the extremes", 
         x = c(rnorm(floor(n / 2)), rnorm(ceiling(n / 2), mean = 10)), 
         y = rnorm(n)),
    list(id = 16, title = "(16) Categorical data", 
         x = sample(1:5, n, replace = TRUE), 
         y = sample(1:7, n, replace = TRUE))
  )
  
  df <- bind_rows(lapply(cases, function(case) {
    id = case$id
    x <- case$x
    y <- compute.y(x, case$y, r)
    data.frame(id = id, x = x, y = y, title = case$title)
  }))
  
  df$title <- factor(df$title, levels = paste0("(", 1:16, ") ", c(
    "Normal x, normal residuals",
    "Uniform x, normal residuals",
    "+-skewed x, normal residuals",
    "--skewed x, normal residuals",
    "Normal x, +-skewed residuals",
    "Normal x, --skewed residuals",
    "Increasing spread",
    "Decreasing spread",
    "Quadratic trend",
    "Sinusoid relationship",
    "A single positive outlier",
    "A single negative outlier",
    "Bimodal residuals",
    "Two groups",
    "Sampling at the extremes",
    "Categorical data"
  )))
  

  return(df)
}


data <- corr_r(r=0.3, n=100)

analysis_data <- data |> filter(id == 1)
model <- lm(y ~ x, data = analysis_data)
summary(model)

Call:
lm(formula = y ~ x, data = analysis_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.19848 -0.07113 -0.00910  0.06042  0.34241 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -0.00313    0.01015  -0.308  0.75846   
x            0.03463    0.01112   3.113  0.00243 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.101 on 98 degrees of freedom
Multiple R-squared:   0.09, Adjusted R-squared:  0.08071 
F-statistic: 9.692 on 1 and 98 DF,  p-value: 0.002426
cor(analysis_data$x, analysis_data$y)
[1] 0.3
plot_r(data, showSmoother=FALSE)

plot_r(data, showSmoother=TRUE)

Visualizing Model Outputs

# Load data
data <- read_csv(here::here("data", "processed", "simulated_data.csv"))
Rows: 50000 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): state, self_report_income, edu, race_ethnicity, insurance, job_type
dbl (14): id, provider_id, received_comprehensive_postnatal_care, age, depen...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sample 1/4 of the sites
set.seed(123)
unique_sites <- unique(data$provider_id)
reduced_sites <- sample(unique_sites, length(unique_sites) * 0.10)
data <- data[data$provider_id %in% reduced_sites, ]

# Fit the model
model <- glmer(
  received_comprehensive_postnatal_care ~ 
    race_ethnicity +
    log(distance_to_provider) +
    insurance +
    multiple_gestation + 
    placenta_previa + 
    gest_hypertension + 
    preeclampsia +
    (1 | provider_id),  
  data = data, 
  family = binomial(link = "logit"),
  control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e5))
)

# Create variable labels for better visualization
variable_labels <- c(
  "race_ethnicityaian" = "AIAN",
  "race_ethnicityasian" = "Asian",
  "race_ethnicityblack" = "Black",
  "race_ethnicityhispanic" = "Hispanic",
  "race_ethnicitynhpi" = "NHPI",
  "race_ethnicityother" = "Other",
  "log(distance_to_provider)" = "Log(Distance to Provider)",
  "insuranceprivate" = "Insurance: Private",
  "insurancestate_provided" = "Insurance: State-Provided",
  "multiple_gestation" = "Multiple Gestation",
  "placenta_previa" = "Placenta Previa",
  "gest_hypertension" = "Gestational Hypertension",
  "preeclampsia" = "Preeclampsia",
  "(Intercept)" = "Intercept"
)
# Get fixed effects with confidence intervals
fixed_effects <- tidy(model, conf.int = TRUE) %>%
  filter(effect == "fixed") %>%
  # Remove any NA values
  filter(!is.na(estimate)) %>%
  # Create a more readable term name
  mutate(term = case_when(
    term == "(Intercept)" ~ "Intercept",
    term == "age" ~ "Age",
    term == "sexMale" ~ "Male Sex",
    term == "raceBlack" ~ "Black Race",
    term == "raceHispanic" ~ "Hispanic Race",
    term == "raceOther" ~ "Other Race",
    term == "insurancePrivate" ~ "Private Insurance",
    term == "insuranceMedicaid" ~ "Medicaid",
    term == "insuranceOther" ~ "Other Insurance",
    term == "comorbidity_score" ~ "Comorbidity Score",
    term == "emergency_admissionTRUE" ~ "Emergency Admission",
    term == "weekend_admissionTRUE" ~ "Weekend Admission",
    term == "night_admissionTRUE" ~ "Night Admission",
    TRUE ~ term
  ))

# Create the forest plot
ggplot(fixed_effects, aes(x = estimate, y = reorder(term, estimate))) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), 
                 height = 0.2, color = colors$slate$`300`) +
  geom_point(size = 3, color = colors$blue$`500`) +
  geom_vline(xintercept = 0, linetype = "dashed", color = colors$red$`600`) +
  labs(
    title = "Fixed Effects from GLMM",
    subtitle = "Each point represents the estimated effect on log-odds of receiving comprehensive care",
    x = "Effect Size (95% CI)",
    y = "Predictor"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 10)
  )

# Get random effects
random_effects <- ranef(model, condVar = TRUE)
random_effects_data <- as.data.frame(random_effects)

# Create the forest plot of random effects
ggplot(random_effects_data, aes(x = condval, y = reorder(grp, condval))) +
  geom_errorbarh(aes(xmin = condval - 1.96*sqrt(attr(random_effects$provider_id, "postVar")[1,1,]), 
                     xmax = condval + 1.96*sqrt(attr(random_effects$provider_id, "postVar")[1,1,])), 
                 height = 0.2, color = colors$slate$`300`) +
  geom_point(size = 3, color = colors$blue$`500`) +
  geom_vline(xintercept = 0, linetype = "dashed", color = colors$red$`600`) +
  labs(
    title = "Random Effects by Provider",
    subtitle = "Each point represents the provider-specific deviation from the overall intercept",
    x = "Provider-Specific Effect (95% CI)",
    y = "Provider ID"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 10)
  )

# Print model summary
summary(model)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
received_comprehensive_postnatal_care ~ race_ethnicity + log(distance_to_provider) +  
    insurance + multiple_gestation + placenta_previa + gest_hypertension +  
    preeclampsia + (1 | provider_id)
   Data: data
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

      AIC       BIC    logLik -2*log(L)  df.resid 
   6183.2    6280.7   -3076.6    6153.2      4913 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.1303 -0.7272 -0.6137  1.2288  2.3987 

Random effects:
 Groups      Name        Variance Std.Dev.
 provider_id (Intercept) 0.1589   0.3987  
Number of obs: 4928, groups:  provider_id, 50

Fixed effects:
                           Estimate Std. Error z value Pr(>|z|)  
(Intercept)               -0.905542   0.391365  -2.314   0.0207 *
race_ethnicityasian        0.248827   0.402795   0.618   0.5367  
race_ethnicityblack       -0.059232   0.395410  -0.150   0.8809  
race_ethnicityhispanic     0.055657   0.388852   0.143   0.8862  
race_ethnicitynhpi         0.004071   0.700685   0.006   0.9954  
race_ethnicityother        0.279441   0.544121   0.514   0.6076  
race_ethnicitywhite        0.196744   0.384417   0.512   0.6088  
log(distance_to_provider) -0.023705   0.023838  -0.994   0.3200  
insuranceprivate           0.139653   0.074399   1.877   0.0605 .
insurancestate_provided    0.045824   0.082829   0.553   0.5801  
multiple_gestation         0.175905   0.178027   0.988   0.3231  
placenta_previa            0.309438   0.315725   0.980   0.3270  
gest_hypertension          0.035563   0.137519   0.259   0.7959  
preeclampsia               0.126002   0.224375   0.562   0.5744  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation matrix not shown by default, as p = 14 > 12.
Use print(x, correlation=TRUE)  or
    vcov(x)        if you need it